{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第8章 分类数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>School</th>\n",
       "      <th>Class</th>\n",
       "      <th>ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Address</th>\n",
       "      <th>Height</th>\n",
       "      <th>Weight</th>\n",
       "      <th>Math</th>\n",
       "      <th>Physics</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>S_1</td>\n",
       "      <td>C_1</td>\n",
       "      <td>1101</td>\n",
       "      <td>M</td>\n",
       "      <td>street_1</td>\n",
       "      <td>173</td>\n",
       "      <td>63</td>\n",
       "      <td>34.0</td>\n",
       "      <td>A+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>S_1</td>\n",
       "      <td>C_1</td>\n",
       "      <td>1102</td>\n",
       "      <td>F</td>\n",
       "      <td>street_2</td>\n",
       "      <td>192</td>\n",
       "      <td>73</td>\n",
       "      <td>32.5</td>\n",
       "      <td>B+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>S_1</td>\n",
       "      <td>C_1</td>\n",
       "      <td>1103</td>\n",
       "      <td>M</td>\n",
       "      <td>street_2</td>\n",
       "      <td>186</td>\n",
       "      <td>82</td>\n",
       "      <td>87.2</td>\n",
       "      <td>B+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>S_1</td>\n",
       "      <td>C_1</td>\n",
       "      <td>1104</td>\n",
       "      <td>F</td>\n",
       "      <td>street_2</td>\n",
       "      <td>167</td>\n",
       "      <td>81</td>\n",
       "      <td>80.4</td>\n",
       "      <td>B-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>S_1</td>\n",
       "      <td>C_1</td>\n",
       "      <td>1105</td>\n",
       "      <td>F</td>\n",
       "      <td>street_4</td>\n",
       "      <td>159</td>\n",
       "      <td>64</td>\n",
       "      <td>84.8</td>\n",
       "      <td>B+</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  School Class    ID Gender   Address  Height  Weight  Math Physics\n",
       "0    S_1   C_1  1101      M  street_1     173      63  34.0      A+\n",
       "1    S_1   C_1  1102      F  street_2     192      73  32.5      B+\n",
       "2    S_1   C_1  1103      M  street_2     186      82  87.2      B+\n",
       "3    S_1   C_1  1104      F  street_2     167      81  80.4      B-\n",
       "4    S_1   C_1  1105      F  street_4     159      64  84.8      B+"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "df = pd.read_csv('data/table.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 一、category的创建及其性质\n",
    "### 1. 分类变量的创建\n",
    "#### （a）用Series创建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "2    c\n",
       "3    a\n",
       "dtype: category\n",
       "Categories (3, object): [a, b, c]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([\"a\", \"b\", \"c\", \"a\"], dtype=\"category\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （b）对DataFrame指定类型创建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "A    category\n",
       "B      object\n",
       "dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "temp_df = pd.DataFrame({'A':pd.Series([\"a\", \"b\", \"c\", \"a\"], dtype=\"category\"),'B':list('abcd')})\n",
    "temp_df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （c）利用内置Categorical类型创建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "2    c\n",
       "3    a\n",
       "dtype: category\n",
       "Categories (3, object): [a, b, c]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat = pd.Categorical([\"a\", \"b\", \"c\", \"a\"], categories=['a','b','c'])\n",
    "pd.Series(cat)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （d）利用cut函数创建"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 默认使用区间类型为标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(10, 30], (0, 10], (10, 30], (30, 60], (30, 60]]\n",
       "Categories (3, interval[int64]): [(0, 10] < (10, 30] < (30, 60]]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(np.random.randint(0,60,5), [0,10,30,60])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 可指定字符为标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[10-30, 30-60, 30-60, 10-30, 30-60]\n",
       "Categories (3, object): [0-10 < 10-30 < 30-60]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(np.random.randint(0,60,5), [0,10,30,60], right=False, labels=['0-10','10-30','30-60'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 分类变量的结构\n",
    "#### 一个分类变量包括三个部分，元素值（values）、分类类别（categories）、是否有序（order）\n",
    "#### 从上面可以看出，使用cut函数创建的分类变量默认为有序分类变量\n",
    "#### 下面介绍如何获取或修改这些属性\n",
    "#### （a）describe方法\n",
    "#### 该方法描述了一个分类序列的情况，包括非缺失值个数、元素值类别数（不是分类类别数）、最多次出现的元素及其频数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count     4\n",
       "unique    3\n",
       "top       a\n",
       "freq      2\n",
       "dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series(pd.Categorical([\"a\", \"b\", \"c\", \"a\",np.nan], categories=['a','b','c','d']))\n",
    "s.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （b）categories和ordered属性\n",
    "#### 查看分类类别和是否排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['a', 'b', 'c', 'd'], dtype='object')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.cat.ordered"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 类别的修改"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （a）利用set_categories修改\n",
    "#### 修改分类，但本身值不会变化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    NaN\n",
       "1    NaN\n",
       "2      c\n",
       "3    NaN\n",
       "4    NaN\n",
       "dtype: category\n",
       "Categories (2, object): [new_a, c]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series(pd.Categorical([\"a\", \"b\", \"c\", \"a\",np.nan], categories=['a','b','c','d']))\n",
    "s.cat.set_categories(['new_a','c'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （b）利用rename_categories修改\n",
    "#### 需要注意的是该方法会把值和分类同时修改"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    new_a\n",
       "1    new_b\n",
       "2    new_c\n",
       "3    new_a\n",
       "4      NaN\n",
       "dtype: category\n",
       "Categories (4, object): [new_a, new_b, new_c, new_d]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series(pd.Categorical([\"a\", \"b\", \"c\", \"a\",np.nan], categories=['a','b','c','d']))\n",
    "s.cat.rename_categories(['new_%s'%i for i in s.cat.categories])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 利用字典修改值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    new_a\n",
       "1    new_b\n",
       "2        c\n",
       "3    new_a\n",
       "4      NaN\n",
       "dtype: category\n",
       "Categories (4, object): [new_a, new_b, c, d]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.cat.rename_categories({'a':'new_a','b':'new_b'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （c）利用add_categories添加"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      a\n",
       "1      b\n",
       "2      c\n",
       "3      a\n",
       "4    NaN\n",
       "dtype: category\n",
       "Categories (5, object): [a, b, c, d, e]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series(pd.Categorical([\"a\", \"b\", \"c\", \"a\",np.nan], categories=['a','b','c','d']))\n",
    "s.cat.add_categories(['e'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （d）利用remove_categories移除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      a\n",
       "1      b\n",
       "2      c\n",
       "3      a\n",
       "4    NaN\n",
       "dtype: category\n",
       "Categories (3, object): [a, b, c]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series(pd.Categorical([\"a\", \"b\", \"c\", \"a\",np.nan], categories=['a','b','c','d']))\n",
    "s.cat.remove_categories(['d'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （e）删除元素值未出现的分类类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      a\n",
       "1      b\n",
       "2      c\n",
       "3      a\n",
       "4    NaN\n",
       "dtype: category\n",
       "Categories (3, object): [a, b, c]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series(pd.Categorical([\"a\", \"b\", \"c\", \"a\",np.nan], categories=['a','b','c','d']))\n",
    "s.cat.remove_unused_categories()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 二、分类变量的排序\n",
    "#### 前面提到，分类数据类型被分为有序和无序，这非常好理解，例如分数区间的高低是有序变量，考试科目的类别一般看做无序变量"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 序的建立"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （a）一般来说会将一个序列转为有序变量，可以利用as_ordered方法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    d\n",
       "2    c\n",
       "3    a\n",
       "dtype: category\n",
       "Categories (3, object): [a < c < d]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([\"a\", \"d\", \"c\", \"a\"]).astype('category').cat.as_ordered()\n",
    "s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 退化为无序变量，只需要使用as_unordered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    d\n",
       "2    c\n",
       "3    a\n",
       "dtype: category\n",
       "Categories (3, object): [a, c, d]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.cat.as_unordered()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （b）利用set_categories方法中的order参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    d\n",
       "2    c\n",
       "3    a\n",
       "dtype: category\n",
       "Categories (3, object): [a < c < d]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([\"a\", \"d\", \"c\", \"a\"]).astype('category').cat.set_categories(['a','c','d'],ordered=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （c）利用reorder_categories方法\n",
    "#### 这个方法的特点在于，新设置的分类必须与原分类为同一集合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    d\n",
       "2    c\n",
       "3    a\n",
       "dtype: category\n",
       "Categories (3, object): [a < c < d]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([\"a\", \"d\", \"c\", \"a\"]).astype('category')\n",
    "s.cat.reorder_categories(['a','c','d'],ordered=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#s.cat.reorder_categories(['a','c'],ordered=True) #报错\n",
    "#s.cat.reorder_categories(['a','c','d','e'],ordered=True) #报错"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 排序"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 先前在第1章介绍的值排序和索引排序都是适用的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       good\n",
       "1       fair\n",
       "2        bad\n",
       "3    perfect\n",
       "4    perfect\n",
       "dtype: category\n",
       "Categories (5, object): [awful < bad < fair < good < perfect]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series(np.random.choice(['perfect','good','fair','bad','awful'],50)).astype('category')\n",
    "s.cat.set_categories(['perfect','good','fair','bad','awful'][::-1],ordered=True).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "29    perfect\n",
       "17    perfect\n",
       "31    perfect\n",
       "3     perfect\n",
       "4     perfect\n",
       "dtype: category\n",
       "Categories (5, object): [awful, bad, fair, good, perfect]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.sort_values(ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>good</th>\n",
       "      <td>-1.746975</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fair</th>\n",
       "      <td>0.836732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bad</th>\n",
       "      <td>0.094912</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>perfect</th>\n",
       "      <td>-0.724338</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>perfect</th>\n",
       "      <td>-1.456362</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            value\n",
       "cat              \n",
       "good    -1.746975\n",
       "fair     0.836732\n",
       "bad      0.094912\n",
       "perfect -0.724338\n",
       "perfect -1.456362"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sort = pd.DataFrame({'cat':s.values,'value':np.random.randn(50)}).set_index('cat')\n",
    "df_sort.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>awful</th>\n",
       "      <td>0.245782</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>awful</th>\n",
       "      <td>0.063991</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>awful</th>\n",
       "      <td>1.541862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>awful</th>\n",
       "      <td>-0.062976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>awful</th>\n",
       "      <td>0.472542</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          value\n",
       "cat            \n",
       "awful  0.245782\n",
       "awful  0.063991\n",
       "awful  1.541862\n",
       "awful -0.062976\n",
       "awful  0.472542"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sort.sort_index().head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 三、分类变量的比较操作"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 与标量或等长序列的比较"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （a）标量比较"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     True\n",
       "1    False\n",
       "2    False\n",
       "3     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([\"a\", \"d\", \"c\", \"a\"]).astype('category')\n",
    "s == 'a'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （b）等长序列比较"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     True\n",
       "1    False\n",
       "2     True\n",
       "3    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s == list('abcd')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 与另一分类变量的比较"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （a）等式判别（包含等号和不等号）\n",
    "#### 两个分类变量的等式判别需要满足分类完全相同"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    True\n",
       "1    True\n",
       "2    True\n",
       "3    True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([\"a\", \"d\", \"c\", \"a\"]).astype('category')\n",
    "s == s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s != s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "s_new = s.cat.set_categories(['a','d','e'])\n",
    "#s == s_new #报错"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### （b）不等式判别（包含>=,<=,<,>）\n",
    "#### 两个分类变量的不等式判别需要满足两个条件：① 分类完全相同 ② 排序完全相同"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = pd.Series([\"a\", \"d\", \"c\", \"a\"]).astype('category')\n",
    "#s >= s #报错"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    True\n",
       "1    True\n",
       "2    True\n",
       "3    True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = pd.Series([\"a\", \"d\", \"c\", \"a\"]).astype('category').cat.reorder_categories(['a','c','d'],ordered=True)\n",
    "s >= s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 四、问题与练习"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 【问题一】 如何使用union_categoricals方法？它的作用是什么？\n",
    "#### 【问题二】 利用concat方法将两个序列纵向拼接，它的结果一定是分类变量吗？什么情况下不是？\n",
    "#### 【问题三】 当使用groupby方法或者value_counts方法时，分类变量的统计结果和普通变量有什么区别？\n",
    "#### 【问题四】 下面的代码说明了Series创建分类变量的什么“缺陷”？如何避免？（提示：使用Series中的copy参数）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 2, 3, 10]\n",
       "Categories (5, int64): [1, 2, 3, 4, 10]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])\n",
    "s = pd.Series(cat, name=\"cat\")\n",
    "cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[10, 10, 3, 10]\n",
       "Categories (5, int64): [1, 2, 3, 4, 10]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.iloc[0:2] = 10\n",
    "cat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 【练习一】 现继续使用第四章中的地震数据集，请解决以下问题：\n",
    "#### （a）现在将深度分为七个等级：[0,5,10,15,20,30,50,np.inf]，请以深度等级Ⅰ,Ⅱ,Ⅲ,Ⅳ,Ⅴ,Ⅵ,Ⅶ为索引并按照由浅到深的顺序进行排序。\n",
    "#### （b）在（a）的基础上，将烈度分为4个等级：[0,3,4,5,np.inf]，依次对南部地区的深度和烈度等级建立多级索引排序。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>时间</th>\n",
       "      <th>维度</th>\n",
       "      <th>经度</th>\n",
       "      <th>方向</th>\n",
       "      <th>距离</th>\n",
       "      <th>深度</th>\n",
       "      <th>烈度</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2003.05.20</td>\n",
       "      <td>12:17:44 AM</td>\n",
       "      <td>39.04</td>\n",
       "      <td>40.38</td>\n",
       "      <td>west</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2007.08.01</td>\n",
       "      <td>12:03:08 AM</td>\n",
       "      <td>40.79</td>\n",
       "      <td>30.09</td>\n",
       "      <td>west</td>\n",
       "      <td>0.1</td>\n",
       "      <td>5.2</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1978.05.07</td>\n",
       "      <td>12:41:37 AM</td>\n",
       "      <td>38.58</td>\n",
       "      <td>27.61</td>\n",
       "      <td>south_west</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1997.03.22</td>\n",
       "      <td>12:31:45 AM</td>\n",
       "      <td>39.47</td>\n",
       "      <td>36.44</td>\n",
       "      <td>south_west</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2000.04.02</td>\n",
       "      <td>12:57:38 AM</td>\n",
       "      <td>40.80</td>\n",
       "      <td>30.24</td>\n",
       "      <td>south_west</td>\n",
       "      <td>0.1</td>\n",
       "      <td>7.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           日期           时间     维度     经度          方向   距离    深度   烈度\n",
       "0  2003.05.20  12:17:44 AM  39.04  40.38        west  0.1  10.0  0.0\n",
       "1  2007.08.01  12:03:08 AM  40.79  30.09        west  0.1   5.2  4.0\n",
       "2  1978.05.07  12:41:37 AM  38.58  27.61  south_west  0.1   0.0  0.0\n",
       "3  1997.03.22  12:31:45 AM  39.47  36.44  south_west  0.1  10.0  0.0\n",
       "4  2000.04.02  12:57:38 AM  40.80  30.24  south_west  0.1   7.0  0.0"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv('data/Earthquake.csv').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 【练习二】 对于分类变量而言，调用第4章中的变形函数会出现一个BUG（目前的版本下还未修复）：例如对于crosstab函数，按照[官方文档的说法](https://pandas.pydata.org/pandas-docs/version/1.0.0/user_guide/reshaping.html#cross-tabulations)，即使没有出现的变量也会在变形后的汇总结果中出现，但事实上并不是这样，比如下面的例子就缺少了原本应该出现的行'c'和列'f'。基于这一问题，请尝试设计my_crosstab函数，在功能上能够返回正确的结果。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>col_0</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>row_0</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "col_0  d  e\n",
       "row_0      \n",
       "a      1  0\n",
       "b      0  1"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])\n",
    "bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])\n",
    "pd.crosstab(foo, bar)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
